# Continuation of Session 1.

# Read the JSON file and keep only its `ayahs` element (one row per ayah).
Arabic <- fromJSON("arabicAyahsSimple.json")$ayahs

# DATA CLEANUP
# Drop the identifier columns that the analysis in this script does not use.
unused_cols <- c("id", "edition_id", "page_id", "hizbQuarter_id")
Arabic <- Arabic[setdiff(names(Arabic), unused_cols)]
str(Arabic)
## 'data.frame':    6236 obs. of  8 variables:
##  $ surat_id     : int  1 1 1 1 1 1 1 2 2 2 ...
##  $ juz_id       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ number       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ text         : chr  "بسم الله الرحمن الرحيم" "الحمد لله رب العالمين" "الرحمن الرحيم" "مالك يوم الدين" ...
##  $ numberinsurat: int  1 2 3 4 5 6 7 1 2 3 ...
##  $ manzil_id    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ ruku_id      : int  1 1 1 1 1 1 1 2 2 2 ...
##  $ sajda_id     : int  NA NA NA NA NA NA NA NA NA NA ...
# The first ayah prints as 22 visible characters, yet nchar() reported 23 for
# it while the following ayahs match their visible length exactly. An
# invisible character is therefore attached to it -- presumably the U+FEFF
# byte-order mark carried over from the JSON file (TODO confirm against the
# raw data). Remove it so the character counts, and any later tokenisation of
# `text`, are not skewed. Strings without the mark are left unchanged.
Arabic$text <- gsub("\ufeff", "", Arabic$text, fixed = TRUE)

Arabic$text[1]
## [1] "بسم الله الرحمن الرحيم"
nchar(Arabic$text[1])
## [1] 22

# Length of every ayah in characters (nchar() is vectorised over the column).
Arabic$chars <- nchar(Arabic$text)
str(Arabic)
## 'data.frame':    6236 obs. of  9 variables:
##  $ surat_id     : int  1 1 1 1 1 1 1 2 2 2 ...
##  $ juz_id       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ number       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ text         : chr  "بسم الله الرحمن الرحيم" "الحمد لله رب العالمين" "الرحمن الرحيم" "مالك يوم الدين" ...
##  $ numberinsurat: int  1 2 3 4 5 6 7 1 2 3 ...
##  $ manzil_id    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ ruku_id      : int  1 1 1 1 1 1 1 2 2 2 ...
##  $ sajda_id     : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ chars        : int  22 21 13 14 22 21 52 26 37 54 ...
# Per-column overview of the ayah table: quantiles and mean for the integer
# columns, length/class for `text`, and the NA count for `sajda_id`.
summary(Arabic)
##     surat_id          juz_id          number         text          
##  Min.   :  1.00   Min.   : 1.00   Min.   :   1   Length:6236       
##  1st Qu.: 11.00   1st Qu.:12.00   1st Qu.:1560   Class :character  
##  Median : 26.00   Median :19.00   Median :3118   Mode  :character  
##  Mean   : 33.52   Mean   :18.48   Mean   :3118                     
##  3rd Qu.: 51.00   3rd Qu.:26.00   3rd Qu.:4677                     
##  Max.   :114.00   Max.   :30.00   Max.   :6236                     
##                                                                    
##  numberinsurat      manzil_id        ruku_id         sajda_id   
##  Min.   :  1.00   Min.   :1.000   Min.   :  1.0   Min.   : 1.0  
##  1st Qu.: 16.00   1st Qu.:3.000   1st Qu.:190.0   1st Qu.: 4.5  
##  Median : 38.00   Median :5.000   Median :325.0   Median : 8.0  
##  Mean   : 53.51   Mean   :4.529   Mean   :313.2   Mean   : 8.0  
##  3rd Qu.: 75.00   3rd Qu.:7.000   3rd Qu.:455.0   3rd Qu.:11.5  
##  Max.   :286.00   Max.   :7.000   Max.   :556.0   Max.   :15.0  
##                                                   NA's   :6221  
##      chars       
##  Min.   :  3.00  
##  1st Qu.: 29.00  
##  Median : 54.00  
##  Mean   : 66.39  
##  3rd Qu.: 87.00  
##  Max.   :711.00  
## 
# Scatter of ayah length in characters (y) against surat_id (x); point colour
# follows manzil_id. Points are semi-transparent so overlapping ayahs show up
# as denser regions.
ggplot(Arabic) +
  geom_point(
    mapping = aes(x = surat_id, y = chars, colour = manzil_id),
    alpha = 0.3
  ) +
  theme_bw()

#ggplotly(p1)
# Split every ayah on single spaces; `words` is a list with one character
# vector of tokens per ayah.
words <- strsplit(Arabic[["text"]], split = " ")
# Number of tokens in each ayah.
Arabic[["words"]] <- lengths(words)
#summary(words)

# Scatter of words per ayah (y) against surat_id (x); colour follows manzil_id
# and point size follows numberinsurat. The low alpha keeps the heavily
# overlapping points readable.
ggplot(Arabic, aes(x = surat_id, y = words, colour = manzil_id)) +
  geom_point(aes(size = numberinsurat), alpha = 0.1) +
  theme_bw()

# Flatten the per-ayah token list into a single vector holding every word.
AllWords <- unlist(words)
str(AllWords)
##  chr [1:82823] "بسم" "الله" "الرحمن" "الرحيم" "الحمد" "لله" "رب" "العالمين" ...
# Frequency of each distinct word, most frequent first.
concordance <- sort(table(AllWords), decreasing = TRUE)
# Plot the 15 most frequent words. head() returns only the entries that exist,
# whereas concordance[1:15] pads with NA when there are fewer than 15 distinct
# words. Explicit labels replace the default y label, which would otherwise be
# the deparsed call text.
plot(head(concordance, 15L), xlab = "Word", ylab = "Occurrences")